Library Environment

suppressMessages(library(tidyverse))
suppressMessages(library(stringr))
suppressMessages(library(ISLR))
suppressMessages(library(caret))
suppressMessages(library(doMC))
suppressMessages(library(plotly))
suppressMessages(library(stringr))
registerDoMC(cores=4)

Loading and processing the cleaned CTU-13 data

# Read the pipe-separated, labelled CTU-13 export; text columns stay character.
myData_cleaned <- read.csv(
  '/home/jguerra/datasets/ctu13.labeled.cleaned',
  sep = '|',
  stringsAsFactors = FALSE
)
# Untouched copy of the table as read from disk.
myData_cleaned.bkp <- myData_cleaned
myData_cleaned

# Count, for every row, how many characters of `State` fall in each symbol
# group (periodicity, duration and size groups).
myData_cleaned <- myData_cleaned %>%
  mutate(
    # Periodicity
    strong_p = str_count(State, '[a-i]'),
    weak_p = str_count(State, '[A-I]'),
    weak_np = str_count(State, '[r-z]'),
    strong_np = str_count(State, '[R-Z]'),
    # Duration
    duration_s = str_count(State, '(a|A|r|R|1|d|D|u|U|4|g|G|x|X|7)'),
    duration_m = str_count(State, '(b|B|s|S|2|e|E|v|V|5|h|H|y|Y|8)'),
    duration_l = str_count(State, '(c|C|t|T|3|f|F|w|W|6|i|I|z|Z|9)'),
    # Size
    size_s = str_count(State, '[a-c]') + str_count(State, '[A-C]') +
      str_count(State, '[r-t]') + str_count(State, '[R-T]') +
      str_count(State, '[1-3]'),
    size_m = str_count(State, '[d-f]') + str_count(State, '[D-F]') +
      str_count(State, '[u-w]') + str_count(State, '[U-W]') +
      str_count(State, '[4-6]'),
    size_l = str_count(State, '[g-i]') + str_count(State, '[G-I]') +
      str_count(State, '[x-z]') + str_count(State, '[X-Z]') +
      str_count(State, '[7-9]')
  )

# Turn every raw count into a ratio by dividing it by `modelsize`.
myData_cleaned <- myData_cleaned %>%
  mutate(
    # Periodicity %
    strong_p = strong_p / modelsize,
    weak_p = weak_p / modelsize,
    strong_np = strong_np / modelsize,
    weak_np = weak_np / modelsize,
    # Duration %
    duration_s = duration_s / modelsize,
    duration_m = duration_m / modelsize,
    duration_l = duration_l / modelsize,
    # Size %
    size_s = size_s / modelsize,
    size_m = size_m / modelsize,
    size_l = size_l / modelsize
  )

# Making feature vectors: pick the ratio columns plus the metadata columns and
# give them short names. Note that `label` becomes `class` and the original
# `class` column becomes `subclass`.
selected_columns <- c('strong_p','weak_p','weak_np','strong_np',
                      'duration_s','duration_m','duration_l',
                      'size_s','size_m','size_l',
                      'modelsize','label','class','port','proto')
feature_vectors_cleaned <- setNames(
  myData_cleaned[, selected_columns],
  c("sp","wp","wnp","snp","ds","dm","dl","ss","sm","sl",
    "modelsize","class","subclass","port","proto")
)
# The label-like columns are stored as factors.
for (factor_column in c("class", "subclass", "proto")) {
  feature_vectors_cleaned[[factor_column]] <-
    factor(feature_vectors_cleaned[[factor_column]])
}
feature_vectors_cleaned

Removing excess Botnet and Normal class samples (making the dataset more balanced)

# Keep the full (unbalanced) feature table before down-sampling.
feature_vectors_cleaned.bkp <- feature_vectors_cleaned
# Class frequencies before balancing, most frequent first.
feature_vectors_cleaned %>% count(class, sort = TRUE)

# The two classes that get capped, split out from the rest.
feature_vectors_cleaned_aux_botnet <- feature_vectors_cleaned %>%
  filter(class == 'Botnet-TCP-SMTP-Attempt-SPAM')
feature_vectors_cleaned_aux_normal <- feature_vectors_cleaned %>%
  filter(class == 'Normal-TCP-HTTP')
feature_vectors_cleaned_aux_botnet
feature_vectors_cleaned_aux_normal

# Every other class is kept in full.
feature_vectors_cleaned_aux_rest <- feature_vectors_cleaned %>%
  filter(class != 'Botnet-TCP-SMTP-Attempt-SPAM') %>%
  filter(class != 'Normal-TCP-HTTP')
feature_vectors_cleaned_aux_rest %>% count(class, sort = TRUE)

# Cap both classes at their first 500 rows. head() is used instead of
# `[1:500, ]` because the latter pads the result with all-NA rows whenever a
# class holds fewer than 500 observations.
aux1 <- rbind(head(feature_vectors_cleaned_aux_botnet, 500),
              head(feature_vectors_cleaned_aux_normal, 500))
aux1 %>% count(class, sort = TRUE)

# Balanced dataset: untouched classes followed by the capped ones.
aux <- rbind(feature_vectors_cleaned_aux_rest, aux1)
aux %>% count(class, sort = TRUE)
feature_vectors_cleaned <- aux

Create training set and testset

# Split the balanced feature table into training (70%) and testing (30%) rows,
# then up-sample the training rows so the `subclass` values are balanced.
# Fixed seed so the partition (and the up-sampling below) is reproducible.
set.seed(212)
# Row indices of the training partition, drawn with respect to `subclass`.
trainIndex <- createDataPartition(feature_vectors_cleaned$subclass, p=0.70, list=FALSE)
data_training <- feature_vectors_cleaned[ trainIndex,]
data_testing <- feature_vectors_cleaned[-trainIndex,]
#data_train = data_train %>% filter(length>5)
# NOTE(review): `data_training` already has a column called "class", and
# yname="class" presumably makes upSample() append the outcome under that same
# name as column 16 -- confirm against the caret documentation.
train <- upSample(x = data_training,  y = data_training$subclass, yname="class")
# Drop column 11 (`modelsize`) and column 16 (presumably the outcome column
# appended by upSample()); 14 columns remain.
training <- train[,-c(11,16)]
# The testing rows only lose column 11 (`modelsize`).
testing <- data_testing[,-c(11)]
training
testing
nrow(training)
[1] 3322
nrow(feature_vectors_cleaned)
[1] 3089

Training configuration

# Resampling setup shared by the random-forest fits: 10-fold cross-validation
# scored with the two-class summary (ROC, sensitivity, specificity), which is
# why class probabilities are requested.
# The former `repeats = 2` argument was removed: it is only honoured by
# method = "repeatedcv" and merely triggered a caret warning with "cv".
# NOTE(review): switch to method = "repeatedcv", repeats = 2 if repeated CV
# was actually intended.
ctrl_fast <- trainControl(method = "cv",
                          number = 10,
                          summaryFunction = twoClassSummary,
                          verboseIter = TRUE,
                          classProbs = TRUE,
                          allowParallel = TRUE)

Experiment 1

Creation of cluster and k parameters analysis

library(factoextra)
library(cluster)
library(NbClust)
feature_vector_training = training[,-c(11,12,13,14)]
# K-means clustering
set.seed(321)
#km.res <- kmeans(feature_vector_training, 3, nstart = 25)
km.res <- kmeans(feature_vector_training, 7, nstart = 25)
# k-means group number of each observation
km.res$cluster
   [1] 2 1 2 1 1 5 5 4 2 5 5 2 3 3 3 2 1 2 3 2 2 1 5 3 2 1 3 3 3 3 3 6 3 4 2 2 1 5 2 1 7 3 2 1 1 2 1 1 1 1 3 1 3 1 1 1 3 2 2 2 2 2 2 6 2 2 2 2 6 6 5 6 4 4 2 2
  [77] 2 4 4 2 2 2 2 3 6 4 4 2 2 4 4 6 6 2 2 3 4 2 2 2 4 2 2 6 4 3 2 5 2 2 6 2 2 2 2 4 4 2 2 2 5 7 3 6 3 5 6 2 2 3 4 2 6 5 2 4 4 3 2 6 4 4 4 4 3 5 2 4 4 4 4 4
 [153] 4 6 6 4 6 3 4 5 6 4 6 3 4 4 6 4 4 4 6 2 4 4 3 3 4 6 6 5 4 6 3 4 5 6 5 4 3 4 4 4 4 4 6 6 6 6 6 6 3 3 5 4 4 1 2 2 4 2 4 6 5 4 4 2 7 6 6 6 3 6 6 7 3 4 5 6
 [229] 7 4 5 2 6 5 3 1 1 1 6 4 6 4 6 6 5 6 6 6 6 6 6 6 7 5 7 5 1 3 2 1 2 1 2 1 1 1 1 2 2 3 2 5 5 5 1 7 7 7 7 1 3 1 1 2 3 2 2 3 3 1 1 1 1 2 2 1 3 1 1 2 2 3 4 3
 [305] 1 1 1 2 2 3 1 2 1 3 1 2 1 2 1 2 3 2 2 2 2 2 1 1 2 5 5 1 1 5 1 1 1 1 1 1 1 1 1 1 1 6 1 1 6 2 1 1 1 5 2 1 2 5 2 2 1 4 1 6 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [381] 1 1 1 1 1 1 1 1 1 3 2 2 1 1 3 2 1 2 2 2 2 5 1 3 5 3 2 1 2 1 2 2 1 1 1 1 3 1 2 1 5 2 2 1 1 1 3 3 1 1 2 2 2 1 3 2 2 1 1 3 1 2 6 2 6 5 5 5 5 2 2 2 2 2 2 3
 [457] 2 2 2 2 2 2 2 6 1 6 6 6 3 3 6 6 3 6 6 2 1 1 2 1 2 6 4 1 3 2 2 2 3 1 3 2 3 1 1 1 1 1 1 2 2 1 2 1 2 3 1 1 6 2 3 1 1 1 7 1 7 2 3 1 3 3 3 2 2 3 2 1 1 1 1 3
 [533] 1 2 1 2 5 7 4 3 3 3 1 1 2 2 1 2 3 3 2 2 2 1 1 3 1 5 1 3 2 1 1 1 5 1 2 2 3 2 2 2 3 3 3 3 1 2 1 2 2 2 3 3 2 3 1 3 1 1 1 2 1 1 1 1 3 3 3 2 2 1 2 2 2 2 1 1
 [609] 1 6 2 1 1 2 1 1 1 3 1 1 2 2 3 2 1 1 1 1 3 3 1 1 4 2 2 2 1 2 3 2 2 1 5 2 2 1 2 2 1 1 1 2 3 2 2 3 3 3 3 1 1 1 1 1 2 2 2 3 1 1 1 2 2 3 2 1 1 1 3 1 1 4 1 2
 [685] 3 3 2 1 3 3 1 1 1 2 3 5 1 1 1 5 2 2 2 3 1 3 3 1 2 1 1 2 2 2 3 2 1 3 3 1 7 2 3 1 3 2 7 1 2 1 1 2 1 1 1 2 2 3 1 3 1 1 2 1 1 1 3 1 3 2 1 1 1 3 2 1 3 2 4 1
 [761] 1 3 1 2 3 2 1 1 1 1 1 2 1 4 3 3 2 1 1 2 3 2 3 1 1 2 3 2 2 2 2 3 7 2 3 1 3 6 3 1 2 2 2 3 1 1 1 3 1 1 1 1 1 1 1 2 3 2 1 4 1 1 1 3 1 2 1 4 3 3 1 6 2 3 1 1
 [837] 2 1 2 1 1 1 2 1 1 1 1 1 4 1 2 1 7 1 1 1 1 1 2 2 1 1 1 2 1 1 2 2 3 1 1 2 1 2 2 1 1 2 3 3 3 2 3 3 1 1 1 1 2 1 1 2 2 2 1 1 3 1 1 1 1 2 1 3 3 2 2 2 2 1 2 1
 [913] 3 1 5 2 1 1 1 1 2 2 4 2 1 1 3 2 2 2 1 3 2 1 1 1 3 3 2 3 3 1 1 1 2 1 1 1 1 3 3 1 3 3 2 1 1 1 1 1 2 2 1 2 1 3 2 1 2 2 3 2 2 2 5 3 2 2 2 5 1 1 1 1 1 2 2 1
 [989] 1 1 1 3 1 1 1 3 1 2 1 3
 [ reached getOption("max.print") -- omitted 2322 entries ]
# Visualize k-means clusters
fviz_cluster(km.res, data = feature_vector_training, geom = "point",
             stand = FALSE, ellipse.type = "norm")

Elbow analysis

set.seed(321)
# Compute and plot the total within-cluster sum of squares (wss)
# for k = 1 to k = k.max
k.max <- 15 # Maximal number of clusters
data <- feature_vector_training
# vapply() guarantees one numeric value per k.
wss <- vapply(seq_len(k.max),
              function(k) kmeans(data, k, nstart = 10)$tot.withinss,
              numeric(1))
plot(seq_len(k.max), wss,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
# Dashed marker at k = 3.
abline(v = 3, lty = 2)

Silhouette analysis

set.seed(322)
k.max <- 10
data <- feature_vector_training
nrow(data)
[1] 3322
# sil[k] holds the average silhouette width for k clusters; sil[1] stays 0
# because the loop starts at k = 2.
sil <- rep(0, k.max)
# The pairwise distances do not depend on k, so compute them once instead of
# on every iteration.
data_dist <- dist(data)
# Compute the average silhouette width for
# k = 2 to k = k.max
for (i in 2:k.max) {
  km.res <- kmeans(data, centers = i, nstart = 25)
  ss <- silhouette(km.res$cluster, data_dist)
  # Column 3 of the silhouette object is the per-observation width.
  sil[i] <- mean(ss[, 3])
}
# Plot the average silhouette width
plot(1:k.max, sil, type = "b", pch = 19,
     frame = FALSE, xlab = "Number of clusters k",
     ylab = "Average silhouette width")
# Dashed marker at the k with the largest average width.
abline(v = which.max(sil), lty = 2)

Useful functions

# Cold-start learning curve for a cluster-wise random-forest ensemble.
#
# For growing prefixes of `training.sampled` (200, 400, ... rows) the prefix is
# split into 3 k-means clusters, one random forest is trained per cluster, and
# the three forests vote (+1 for "botnet", -1 otherwise) on every row of
# `testing` and of the prefix itself. The F1 score of the majority vote is
# recorded for both sets.
#
# Args:
#   training.sampled: shuffled training data frame with the feature columns
#     sp, wp, wnp, snp, ds, dm, dl, ss, sm, sl, a `subclass` outcome, and
#     columns 11-14 being the non-feature columns dropped before clustering.
#   testing: data frame with the same feature columns and `subclass`.
#   settings: caret trainControl object passed to train(); it must provide
#     class probabilities and ROC (e.g. twoClassSummary).
#
# Returns (invisibly) a list with:
#   output:         data frame (data_count, metric) - F1 on `testing`.
#   output_t:       data frame (data_count, metric) - F1 on the training prefix.
#   testing_result: data frame with a leading all-zero column followed by one
#     `result` column of vote totals per training size.
cold_start_data <- function(training.sampled, testing, settings) {
  step_size <- 200
  n_clusters <- 3
  # Whole number of 200-row steps; the fractional remainder is ignored.
  n_steps <- nrow(training.sampled) %/% step_size
  if (n_steps < 1) {
    stop("`training.sampled` needs at least ", step_size, " rows.",
         call. = FALSE)
  }

  library(doParallel)
  cl <- makeCluster(2)
  registerDoParallel(cl)
  # Always release the two workers, and fall back to the sequential backend so
  # later foreach/caret calls do not try to use the stopped cluster.
  on.exit({
    parallel::stopCluster(cl)
    foreach::registerDoSEQ()
  }, add = TRUE)

  data_count <- step_size * seq_len(n_steps)
  metric <- numeric(n_steps)
  metric_t <- numeric(n_steps)
  testing_result <- data.frame(numeric(nrow(testing)))

  # confusionMatrix() requires factors that share the same levels.
  reference_test <- testing$subclass
  if (!is.factor(reference_test)) {
    reference_test <- factor(reference_test)
  }

  for (i in seq_len(n_steps)) {
    aux_training_set <- training.sampled[seq_len(data_count[i]), ]
    clusters <- kmeans(aux_training_set[, -c(11, 12, 13, 14)],
                       n_clusters, nstart = 25)
    aux_training_set_cluster <- cbind(aux_training_set,
                                      cluster = clusters$cluster)
    result_vector <- numeric(nrow(testing))
    result_vector_trainning <- numeric(nrow(aux_training_set))

    for (j in seq_len(n_clusters)) {
      cluster_data <- dplyr::filter(aux_training_set_cluster, cluster == j)
      new_rfFit <- train(subclass ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
                         data = cluster_data,
                         metric = "ROC",
                         method = "rf",
                         trControl = settings)

      # Each per-cluster forest casts one vote per observation.
      predsrfprobs <- predict(new_rfFit, testing, type = 'prob')
      result_vector <- result_vector +
        ifelse(predsrfprobs$botnet > 0.5, 1, -1)

      predsrfprobs_t <- predict(new_rfFit, aux_training_set, type = 'prob')
      result_vector_trainning <- result_vector_trainning +
        ifelse(predsrfprobs_t$botnet > 0.5, 1, -1)
    }

    reference_train <- aux_training_set$subclass
    if (!is.factor(reference_train)) {
      reference_train <- factor(reference_train)
    }
    # Majority vote: a positive total means "botnet".
    a <- factor(ifelse(result_vector > 0, 'botnet', 'normal'),
                levels = levels(reference_test))
    b <- factor(ifelse(result_vector_trainning > 0, 'botnet', 'normal'),
                levels = levels(reference_train))

    testing_result <- cbind(testing_result, 'result' = result_vector)
    cm <- confusionMatrix(a, reference_test)
    metric[i] <- cm$byClass['F1']

    cm_t <- confusionMatrix(b, reference_train)
    metric_t[i] <- cm_t$byClass['F1']
  }

  output <- data.frame(data_count = data_count, metric = metric)
  output_t <- data.frame(data_count = data_count, metric = metric_t)
  invisible(list('output' = output,
                 'output_t' = output_t,
                 'testing_result' = testing_result))
}
# Cold-start learning curve for a single random forest (no clustering).
#
# For growing prefixes of `training.sampled` (200, 400, ... rows) one random
# forest is trained and its F1 score is recorded on `testing` and on the
# training prefix itself. A row is predicted "botnet" when its botnet
# probability is >= 0.5.
#
# Args:
#   training.sampled: shuffled training data frame with the feature columns
#     sp, wp, wnp, snp, ds, dm, dl, ss, sm, sl and a `subclass` outcome.
#   testing: data frame with the same feature columns and `subclass`.
#   settings: caret trainControl object passed to train(); it must provide
#     class probabilities and ROC (e.g. twoClassSummary).
#
# Returns (invisibly) a list with:
#   output:         data frame (data_count, metric) - F1 on `testing`.
#   output_t:       data frame (data_count, metric) - F1 on the training prefix.
#   testing_result: one-column all-zero data frame with nrow(testing) rows
#     (this function never adds vote columns to it).
cold_start_data_only_rf <- function(training.sampled, testing, settings) {
  step_size <- 200
  # Whole number of 200-row steps; the fractional remainder is ignored.
  n_steps <- nrow(training.sampled) %/% step_size
  if (n_steps < 1) {
    stop("`training.sampled` needs at least ", step_size, " rows.",
         call. = FALSE)
  }

  # Plain arithmetic: no parallel backend is needed to build the size grid.
  data_count <- step_size * seq_len(n_steps)
  metric <- numeric(n_steps)
  metric_t <- numeric(n_steps)
  testing_result <- data.frame(numeric(nrow(testing)))

  # confusionMatrix() requires factors that share the same levels.
  reference_test <- testing$subclass
  if (!is.factor(reference_test)) {
    reference_test <- factor(reference_test)
  }

  for (i in seq_len(n_steps)) {
    aux_training_set <- training.sampled[seq_len(data_count[i]), ]
    new_rfFit <- train(subclass ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
                       data = aux_training_set,
                       metric = "ROC",
                       method = "rf",
                       trControl = settings)

    # Score on the held-out rows.
    predsrfprobs <- predict(new_rfFit, testing, type = 'prob')
    predsrf <- factor(ifelse(predsrfprobs$botnet >= 0.5, 'botnet', 'normal'),
                      levels = levels(reference_test))
    cm <- confusionMatrix(predsrf, reference_test)
    metric[i] <- cm$byClass['F1']

    # Score on the rows the forest was trained on.
    reference_train <- aux_training_set$subclass
    if (!is.factor(reference_train)) {
      reference_train <- factor(reference_train)
    }
    predsrfprobs_t <- predict(new_rfFit, aux_training_set, type = 'prob')
    predsrf_t <- factor(
      ifelse(predsrfprobs_t$botnet >= 0.5, 'botnet', 'normal'),
      levels = levels(reference_train)
    )
    cm_t <- confusionMatrix(predsrf_t, reference_train)
    metric_t[i] <- cm_t$byClass['F1']
  }

  output <- data.frame(data_count = data_count, metric = metric)
  output_t <- data.frame(data_count = data_count, metric = metric_t)
  invisible(list('output' = output,
                 'output_t' = output_t,
                 'testing_result' = testing_result))
}
# Inject label noise by flipping the `class` of randomly chosen rows.
#
# Args:
#   dataset: data frame with a `class` column holding 'Botnet' / 'Normal'.
#   porcent: NUMBER of rows to flip (it is passed to sample() as a count,
#     despite the name).
#
# Returns: the dataset with the sampled rows relabelled ('Botnet' <-> 'Normal')
#   and all rows shuffled. Sampled rows whose class is neither 'Botnet' nor
#   'Normal' are dropped, as before.
generate_data_noisy <- function(dataset, porcent) {
  n_rows <- nrow(dataset)
  list_aux <- sample(n_rows, porcent)
  noisy_data_sample <- dataset[list_aux, , drop = FALSE]
  # Select the untouched rows with a logical mask: `dataset[-list_aux, ]`
  # returns zero rows when `list_aux` is empty (porcent = 0).
  untouched <- !(seq_len(n_rows) %in% list_aux)
  no_noisy_data_sample <- dataset[untouched, , drop = FALSE]

  # %in% never yields NA, so rows with a missing class are simply not selected.
  sampled_labels <- as.character(noisy_data_sample$class)
  noisy_data_sample_b <-
    noisy_data_sample[sampled_labels %in% 'Botnet', , drop = FALSE]
  noisy_data_sample_n <-
    noisy_data_sample[sampled_labels %in% 'Normal', , drop = FALSE]

  # Flip the labels.
  noisy_data_sample_b$class <-
    as.factor(rep('Normal', nrow(noisy_data_sample_b)))
  noisy_data_sample_n$class <-
    as.factor(rep('Botnet', nrow(noisy_data_sample_n)))

  noisy_data <- rbind(noisy_data_sample_b, noisy_data_sample_n)
  training_noisy <- rbind(no_noisy_data_sample, noisy_data)
  # Shuffle so the flipped rows are not grouped at the end.
  shuffled_rows <- sample(nrow(training_noisy), nrow(training_noisy))
  training_noisy[shuffled_rows, , drop = FALSE]
}
# ELA-style measure computed from two accuracy values.
#
# Args:
#   A0: baseline accuracy (must be non-zero; it is the divisor).
#   Ax: accuracy to compare against the baseline.
#   NOTE(review): the constant 100 suggests both are percentages - confirm.
#
# Returns: (A0 - Ax) / A0 + (100 - A0) / A0, element-wise for vector input.
get_ELA_measure <- function(A0, Ax) {
  # Accuracy lost relative to the baseline.
  relative_loss <- (A0 - Ax) / A0
  # Distance of the baseline from 100, relative to the baseline.
  baseline_shortfall <- (100 - A0) / A0
  relative_loss + baseline_shortfall
}
# Train a random forest on `training_data` and score it on `testing_data`.
#
# Args:
#   training_data: data frame with the feature columns sp, wp, wnp, snp, ds,
#     dm, dl, ss, sm, sl and a `class` outcome with values 'Botnet' / 'Normal'.
#   testing_data: data frame with the same columns.
#
# Returns: the `byClass` statistics of the caret confusion matrix.
randomForest_performace <- function(training_data, testing_data) {
  # NOTE(review): `settings` is not a parameter of this function; it is looked
  # up in the calling environment and must exist there as a trainControl
  # object, otherwise train() fails.
  rfFit <- train(class ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
                 data = training_data,
                 metric = "ROC",
                 method = "rf",
                 trControl = settings)
  predsrfprobs <- predict(rfFit, testing_data, type = 'prob')

  # confusionMatrix() requires factors sharing the same levels, so the
  # predicted labels are aligned with the reference levels.
  reference <- testing_data$class
  if (!is.factor(reference)) {
    reference <- factor(reference)
  }
  predsrf <- factor(ifelse(predsrfprobs$Botnet >= 0.5, 'Botnet', 'Normal'),
                    levels = levels(reference))
  cm <- confusionMatrix(predsrf, reference)
  cm$byClass
}
training
testing

Data training partitions: cold start study

Iteration #1

output_1 <- result$output
output_t_1 <- result$output_t
output_1

gg <- ggplot(data = output_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

first_training_sample <- training.sampled_1[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))

class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

testing_result <- result$testing_result
#testing_result
gg <- ggplot(data = output_t_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

output_t_aux_1 <- output_t_1
names(output_t_aux_1) <- c('data_count_t','metric_t')
output_result_1 <- cbind(output_1,output_t_aux_1)
gg <- ggplot(data = output_result_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

Iteration #2

output_2 <- result_2$output
output_t_2 <- result_2$output_t
output_2

gg <- ggplot(data = output_2)
  gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
    labs(title="Random Forest through data training size", 
         #subtitle="Drawn from Long Data format", 
         caption="Source: CTU-13", 
         y="F1 Score", 
         color=NULL)

first_training_sample <- training.sampled_2[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))

class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Vote totals of the second run (previously read from `result`, i.e. run 1).
testing_result_2 <- result_2$testing_result
#testing_result
# F1 score on the training prefix for each training size.
gg <- ggplot(data = output_t_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

output_t_aux_2 <- output_t_2
names(output_t_aux_2) <- c('data_count_t','metric_t')
output_result_2 <- cbind(output_2,output_t_aux_2)
gg <- ggplot(data = output_result_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

Iteration #3

output_3 <- result_3$output
output_t_3 <- result_3$output_t
output_3

gg <- ggplot(data = output_3)
  gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
    labs(title="Random Forest through data training size", 
         #subtitle="Drawn from Long Data format", 
         caption="Source: CTU-13", 
         y="F1 Score", 
         color=NULL)

first_training_sample <- training.sampled_3[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))

class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Vote totals of the third run (previously read from `result`, i.e. run 1).
testing_result_3 <- result_3$testing_result
#testing_result
# F1 score on the training prefix for each training size.
gg <- ggplot(data = output_t_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

output_t_aux_3 <- output_t_3
names(output_t_aux_3) <- c('data_count_t','metric_t')
output_result_3 <- cbind(output_3,output_t_aux_3)
gg <- ggplot(data = output_result_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

Iteration #4

output_4 <- result_4$output
output_t_4 <- result_4$output_t
output_4

gg <- ggplot(data = output_4)
  gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
    labs(title="Random Forest through data training size", 
         #subtitle="Drawn from Long Data format", 
         caption="Source: CTU-13", 
         y="F1 Score", 
         color=NULL)

first_training_sample <- training.sampled_4[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))

class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Vote totals of the fourth run (previously read from `result`, i.e. run 1).
testing_result_4 <- result_4$testing_result
#testing_result
# F1 score on the training prefix for each training size.
gg <- ggplot(data = output_t_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

output_t_aux_4 <- output_t_4
names(output_t_aux_4) <- c('data_count_t','metric_t')
output_result_4 <- cbind(output_4,output_t_aux_4)
gg <- ggplot(data = output_result_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

Iteration #5

output_5 <- result_5$output
output_t_5 <- result_5$output_t
output_5

gg <- ggplot(data = output_5)
  gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
    labs(title="Random Forest through data training size", 
         #subtitle="Drawn from Long Data format", 
         caption="Source: CTU-13", 
         y="F1 Score", 
         color=NULL)

first_training_sample <- training.sampled_5[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))

class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Vote totals of the fifth run (previously read from `result`, i.e. run 1).
testing_result_5 <- result_5$testing_result
#testing_result
# F1 score on the training prefix for each training size.
gg <- ggplot(data = output_t_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

output_t_aux_5 <- output_t_5
names(output_t_aux_5) <- c('data_count_t','metric_t')
output_result_5 <- cbind(output_5,output_t_aux_5)
gg <- ggplot(data = output_result_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

Data training partitions: cold start study (simple Random Forest)

Iteration #1

rf_output_1 <- rf_result_1$output
rf_output_t_1 <- rf_result_1$output_t
rf_output_1
gg <- ggplot(data = rf_output_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

rf_first_training_sample <- rf_training.sampled_1[1:200,]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))

rf_class_distribution <- rf_first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(rf_first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

#rf_testing_result_1 <- rf_result_1$testing_result
#testing_result
gg <- ggplot(data = rf_output_t_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

rf_output_t_aux_1 <- rf_output_t_1
names(rf_output_t_aux_1) <- c('data_count_t','metric_t')
rf_output_result_1 <- cbind(rf_output_1,rf_output_t_aux_1)
gg <- ggplot(data = rf_output_result_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

Iteration #2

rf_output_2 <- rf_result_2$output
rf_output_t_2 <- rf_result_2$output_t
rf_output_2
gg <- ggplot(data = rf_output_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

rf_first_training_sample <- rf_training.sampled_2[1:200,]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))

rf_class_distribution <- rf_first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(rf_first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

#rf_testing_result_2 <- rf_result_2$testing_result
#testing_result
gg <- ggplot(data = rf_output_t_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

rf_output_t_aux_2 <- rf_output_t_2
names(rf_output_t_aux_2) <- c('data_count_t','metric_t')
rf_output_result_2 <- cbind(rf_output_2,rf_output_t_aux_2)
gg <- ggplot(data = rf_output_result_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

Iteration #3

rf_output_3 <- rf_result_3$output
rf_output_t_3 <- rf_result_3$output_t
rf_output_3
gg <- ggplot(data = rf_output_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

rf_first_training_sample <- rf_training.sampled_3[1:200,]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))

rf_class_distribution <- rf_first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(rf_first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

#rf_testing_result_2 <- rf_result_2$testing_result
#testing_result
gg <- ggplot(data = rf_output_t_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

rf_output_t_aux_3 <- rf_output_t_3
names(rf_output_t_aux_3) <- c('data_count_t','metric_t')
rf_output_result_3 <- cbind(rf_output_3,rf_output_t_aux_3)
gg <- ggplot(data = rf_output_result_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

Iteration #4

rf_output_4 <- rf_result_4$output
rf_output_t_4 <- rf_result_4$output_t
rf_output_4
gg <- ggplot(data = rf_output_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

rf_first_training_sample <- rf_training.sampled_4[1:200,]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))

rf_class_distribution <- rf_first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(rf_first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

#rf_testing_result_2 <- rf_result_2$testing_result
#testing_result
gg <- ggplot(data = rf_output_t_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

rf_output_t_aux_4 <- rf_output_t_4
names(rf_output_t_aux_4) <- c('data_count_t','metric_t')
rf_output_result_4 <- cbind(rf_output_4,rf_output_t_aux_4)
gg <- ggplot(data = rf_output_result_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

Iteration #5

rf_output_5 <- rf_result_5$output
rf_output_t_5 <- rf_result_5$output_t
rf_output_5
gg <- ggplot(data = rf_output_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

rf_first_training_sample <- rf_training.sampled_5[1:200,]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))

rf_class_distribution <- rf_first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(rf_first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

#rf_testing_result_2 <- rf_result_2$testing_result
#testing_result
gg <- ggplot(data = rf_output_t_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

rf_output_t_aux_5 <- rf_output_t_5
names(rf_output_t_aux_5) <- c('data_count_t','metric_t')
rf_output_result_5 <- cbind(rf_output_5,rf_output_t_aux_5)
gg <- ggplot(data = rf_output_result_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

Studies Samples

first_training_sample <- training.sampled[1:200,]
first_training_sample
ggplot(first_training_sample) + geom_bar(aes(subclass))

class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

part 2

set.seed(206)
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
size_training <- nrow(training)
split_size_training = size_training / 200
count_random <- foreach(i=1:split_size_training) %dopar% {
  200 * i
}
training.sampled <- training[sample(size_training, size_training), ]
metric <- foreach(i=1:split_size_training) %do% {
  #library(caret)
  count <- 200 * i
  aux_training_set <- training.sampled[c(1:count), ]#training[sample(size_training, count), ]
  clusters <- kmeans(aux_training_set[,-c(11,12,13,14)],3,nstart = 25)
  aux_training_set_cluster <- cbind(aux_training_set, cluster = clusters$cluster)
  result_vector <- numeric(nrow(testing))
  for (j in c(1:3)){
    cluster_data <- filter(aux_training_set_cluster, cluster == j)
    new_rfFit <- train(class ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
               data = cluster_data,
               metric="ROC",
               method = "rf",
               trControl = ctrl_fast)
    predsrfprobs <- predict(new_rfFit,testing,type='prob')
    for (k in c(1:length(result_vector))){
      if(predsrfprobs$Botnet[k] > 0.5){
        result_vector[k] <- result_vector[k] + 1
      }
      else{
        result_vector[k] <- result_vector[k] - 1
      }
    }
    
  }
  a = ifelse(result_vector > 0,'Botnet','Normal')
  cm <- confusionMatrix(a,testing$class)
  metric <- cm$byClass['F1']#cm$overall[1]
  metric
  
}

output <- do.call(rbind, Map(data.frame, data_count=count_random, metric=metric))
output
# Learning curve of the clustered ensemble. The plotted metric is the F1 score
# taken from cm$byClass['F1'], so the axis is labelled accordingly (it used to
# read "Accuracy").
gg <- ggplot(data = output)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)
cluster_data

Test with only one

set.seed(226)
size_training <- nrow(training)
training.sampled <- training[sample(size_training, size_training), ]

aux_training_set <- training.sampled[c(1:200), ]#training[sample(size_training, 200), ]
clusters <- kmeans(aux_training_set[,-c(11,12,13,14)],3,nstart = 25)
aux_training_set_cluster <- cbind(aux_training_set, cluster = clusters$cluster)
result_vector <- numeric(nrow(testing))
result_vector_trainning <- numeric(nrow(aux_training_set))

for (j in c(1:3)){
  cluster_data <- aux_training_set_cluster %>% filter(cluster == j)
  new_rfFit <- train(subclass ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
               data = cluster_data,
               metric="ROC",
               method = "rf",
               trControl = ctrl_fast)
  predsrfprobs <- predict(new_rfFit,testing,type='prob')
  for (k in c(1:length(result_vector))){
    if(predsrfprobs$botnet[k] > 0.5){
      result_vector[k] <- result_vector[k] + 1
    }
    else{
      result_vector[k] <- result_vector[k] - 1
    }
  }
  
  #Trainning predict
  predsrfprobs_t <- predict(new_rfFit,aux_training_set,type='prob')
  for (k in c(1:length(result_vector_trainning))){
    if(predsrfprobs_t$botnet[k] > 0.5){
      result_vector_trainning[k] <- result_vector_trainning[k] + 1
    }
    else{
      result_vector_trainning[k] <- result_vector_trainning[k] - 1
    }
  }
}

a = ifelse(result_vector > 0,'botnet','normal')
b <- ifelse(result_vector_trainning > 0,'botnet','normal')
cm <- confusionMatrix(a,testing$subclass)
metric <- cm$byClass['F1']#cm$overall[1]
metric
cm_t <- confusionMatrix(b,aux_training_set$subclass)
metric_t <- cm_t$byClass['F1']
metric_t

Sample examples

set.seed(556)
a = c(1,2,3,4,5,6,7,8,9)
r <- sample(9,3)
a[r]
r2 <- sample(9,3)
a[r2]
#testing_result
testing_result.bkp <- testing_result
testing_result
names_aux <- foreach(i=1:(nrow(training)/200)) %do% {
    iteration <- 200 * i
    paste('size_',toString(iteration),sep = "")
}
testing_result_names <- unlist(names_aux, use.names=FALSE)
testing_result <- testing_result[,c(-1)]
names(testing_result) <- testing_result_names
testing_result

testing_aux <- cbind(testing,testing_result)
testing_aux.bkp2 <- testing_aux
#write.table(testing_aux,file="testing_cluster_result.txt",sep="|", row.names = F)
testing_aux
sums <- rowSums(testing_aux[,-c(1:14)])
sums
testing_aux[,-c(1:14)]
testing_aux <- cbind(testing_aux,sums)
testing_aux
testing_aux_result <- testing_aux %>% group_by(class) %>% summarise(n = n(), sums = sum(sums)) %>% arrange(desc(sums))
testing_aux_result

graph_testing_result <- ggplot(testing_aux_result[-c(1,nrow(testing_aux_result)),])
graph_testing_result + geom_point(aes(class,sums)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

feature_vectors_cleaned

library(gridExtra)
pdf("data_output.pdf", height=11, width=8.5)
grid.table(feature_vectors_cleaned[1:20,])
dev.off()

testing_result.bkp
testing_aux.bkp2
testing_aux_result

rusty_data_result <- testing_aux.bkp2
rusty_data_result_short <- rusty_data_result[,-c(1:11,14)]
rusty_data_result_short[,-c(1,2)]
rusty_data_result_short$pos <- rowSums(rusty_data_result_short[,-c(1,2)] > 0)
rusty_data_result_short$neg <- rowSums(rusty_data_result_short[,-c(1,2)] < 0)
rusty_data_result_short_cleaned <- rusty_data_result_short[,c(1,2,46,47)]
rusty_data_result_short_cleaned
rusty_data_result_short_cleaned_result <- rusty_data_result_short_cleaned %>% mutate(good = ifelse(subclass == 'normal',neg,pos))
rusty_data_result_short_cleaned_result <- rusty_data_result_short_cleaned_result %>% mutate(bad = ifelse(subclass == 'normal',pos,neg))
rusty_data_result_short_cleaned_result %>% group_by(port) %>% summarise(n=n(),good = sum(good),bad = sum(bad)) %>% arrange(desc(n))

data_botnet_port <- rusty_data_result_short_cleaned_result %>% filter(subclass == 'botnet')
data_normal_port <- rusty_data_result_short_cleaned_result %>% filter(subclass == 'normal')
data_botnet_port_result <-  data_botnet_port %>% group_by(port) %>% summarise(n=n(),good = sum(good),bad = sum(bad)) %>% arrange(desc(n))
data_normal_port_result <- data_normal_port %>% group_by(port) %>% summarise(n=n(),good = sum(good),bad = sum(bad)) %>% arrange(desc(n))
data_botnet_port_result
data_normal_port_result

ggplot(data = data_botnet_port_result) + 
  geom_bar(mapping = aes(x = port, fill = clarity))

#write.table(data_botnet_port_result,file="data_botnet_port.txt",sep="|", row.names = F)
library(reshape2)
data <- data_botnet_port_result
data$port <- as.factor(data$port)

melt(data[,c(1,3,4)])

ggplot(melt(data[,c(1,3,4)]))+
  geom_col(aes(x=port,y=value,fill=variable))+
  #theme_bw()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Making noisy data (training_noisy: training dataset with 20% of its labels made noisy)

set.seed(101) 
training.bkp <- training
noisy_data <- training

porcent <- nrow(training) / 5
training_noisy <- generate_data_noisy(noisy_data,porcent)
nrow(training)
nrow(training_noisy)
cm$overall[1]
 Accuracy 
0.9222042 

Robustness Analysis (one simple iteration)

rf_measures_result
 [1]         NA 0.92433697         NA 0.92199688         NA 0.92043682         NA 0.91341654
 [9]         NA 0.91809672         NA 0.91107644         NA 0.91575663         NA 0.91809672
[17]         NA 0.90951638         NA 0.91185647         NA 0.91107644         NA 0.90249610
[25]         NA 0.90405616         NA 0.90171607         NA 0.87051482         NA 0.88845554
[33]         NA 0.86739470         NA 0.84243370         NA 0.85179407         NA 0.78549142
[41]         NA 0.72074883         NA 0.74726989         NA 0.66458658         NA 0.58346334
[49]         NA 0.48673947         NA 0.40483619         NA 0.30109204         NA 0.22308892
[57]         NA 0.24258970         NA 0.18798752         NA 0.14664587         NA 0.13182527
[65]         NA 0.12792512         NA 0.12090484         NA 0.12792512         NA 0.11310452
[73]         NA 0.09438378         NA 0.09438378         NA 0.09282371         NA 0.08346334
[81]         NA 0.09516381         NA 0.09438378         NA 0.08658346         NA 0.08112324
[89]         NA 0.07878315

Robustness Analysis: plotting accuracy

ggplotly(g)
We recommend that you use the dev version of ggplot2 with `ggplotly()`
Install it with: `devtools::install_github('hadley/ggplot2')`
`geom_smooth()` using method = 'loess'

Robustness Analysis: plotting ELA measure

index <- seq(2,90,2)
measure_result <- ela_measures_result[index]
ela_measure_data <- data.frame(index,measure_result)
names(ela_measure_data) <- c('noise_porcent','ela_measure')

ggplot(ela_measure_data) + geom_point(mapping = aes(x = noise_porcent, y = ela_measure)) + geom_smooth(mapping = aes(x = noise_porcent, y = ela_measure))

Robustness Analysis (30 iterations)

Cosine Similarity

training
testing
prediction_vector <- testing[1,]
prediction_vector <- as.vector(as.matrix(prediction_vector))
result <- prediction_by_similarity(training,prediction_vector,100)
result

result <- c()
for(i in 1:nrow(testing)){
  prediction_vector <- testing[i,]
  prediction_vector <- as.vector(as.matrix(prediction_vector))
  result[i] <- prediction_by_similarity(training,prediction_vector,100)
}
vector_result <- unlist(result)
cm <- confusionMatrix(vector_result,testing$class)
cm

cs_data.result <- data.frame(rf_result_5$output$data_count)
Warning message:
In str.default(val) : 'object' does not have valid levels()
#for(i in c(1:1)){
  current_seed <- 226 #+ i
  set.seed(current_seed)
  
  size_training <- nrow(training)
  cs_training.sampled_current <- training[sample(size_training, size_training), ]
  split_size_training = size_training / 200
  metric <- numeric(split_size_training)
  for(j in 1:split_size_training){
    count <- 200 * j
    aux_training_set <- cs_training.sampled_current[c(1:count), ]
    result <- c()
    for(k in 1:nrow(testing)){
      prediction_vector <- testing[k,]
      prediction_vector <- as.vector(as.matrix(prediction_vector))
      output_result <- prediction_by_similarity(aux_training_set,prediction_vector,101)
      result[k] <- output_result
    }
    vector_result <- unlist(result)
    cm <- confusionMatrix(vector_result,testing$class)
    metric[j] <- cm$byClass['F1']
    
  }
  cs_data.result <- cbind(cs_data.result, metric)
Error in data.frame(..., check.names = FALSE) : 
  arguments imply differing number of rows: 16, 51
---
title: "CAI's experiments"
output: html_notebook
---

### Library Environment
```{r}
suppressMessages(library(tidyverse))
suppressMessages(library(stringr))
suppressMessages(library(ISLR))
suppressMessages(library(caret))
suppressMessages(library(doMC))
suppressMessages(library(plotly))
suppressMessages(library(stringr))
registerDoMC(cores=4)
```

### Load and processing data ctu13 cleaned
```{r}
myData_cleaned <- read.csv('/home/jguerra/datasets/ctu13.labeled.cleaned', stringsAsFactors = F, sep = '|')
myData_cleaned.bkp = myData_cleaned
myData_cleaned

#Periodicity
myData_cleaned = myData_cleaned %>% mutate(strong_p = str_count(State,'[a-i]'))
myData_cleaned = myData_cleaned %>% mutate(weak_p = str_count(State,'[A-I]'))
myData_cleaned = myData_cleaned %>% mutate(weak_np = str_count(State,'[r-z]'))
myData_cleaned = myData_cleaned %>% mutate(strong_np = str_count(State,'[R-Z]'))
#Duration
myData_cleaned = myData_cleaned %>% mutate(duration_s = str_count(State,'(a|A|r|R|1|d|D|u|U|4|g|G|x|X|7)'))
myData_cleaned = myData_cleaned %>% mutate(duration_m = str_count(State,'(b|B|s|S|2|e|E|v|V|5|h|H|y|Y|8)'))
myData_cleaned = myData_cleaned %>% mutate(duration_l = str_count(State,'(c|C|t|T|3|f|F|w|W|6|i|I|z|Z|9)'))
#Size
myData_cleaned = myData_cleaned %>% mutate(size_s = str_count(State,'[a-c]') + str_count(State,'[A-C]') + str_count(State,'[r-t]') + str_count(State,'[R-T]') + str_count(State,'[1-3]'))
myData_cleaned = myData_cleaned %>% mutate(size_m = str_count(State,'[d-f]') + str_count(State,'[D-F]') + str_count(State,'[u-w]') + str_count(State,'[U-W]') + str_count(State,'[4-6]'))
myData_cleaned = myData_cleaned %>% mutate(size_l = str_count(State,'[g-i]') + str_count(State,'[G-I]') + str_count(State,'[x-z]') + str_count(State,'[X-Z]') + str_count(State,'[7-9]'))

#Periodicity %
myData_cleaned <- myData_cleaned %>% mutate(strong_p = (strong_p / modelsize))
myData_cleaned <- myData_cleaned %>% mutate(weak_p = (weak_p / modelsize))
myData_cleaned <- myData_cleaned %>% mutate(strong_np = (strong_np / modelsize))
myData_cleaned <- myData_cleaned %>% mutate(weak_np = (weak_np / modelsize))
#Duration %
myData_cleaned <- myData_cleaned %>% mutate(duration_s = (duration_s / modelsize))
myData_cleaned <- myData_cleaned %>% mutate(duration_m = (duration_m / modelsize))
myData_cleaned <- myData_cleaned %>% mutate(duration_l = (duration_l / modelsize))
#Size %
myData_cleaned <- myData_cleaned %>% mutate(size_s = (size_s / modelsize))
myData_cleaned <- myData_cleaned %>% mutate(size_m = (size_m / modelsize))
myData_cleaned <- myData_cleaned %>% mutate(size_l = (size_l / modelsize))

#Making feature vectors
feature_vectors_cleaned = myData_cleaned[,c('strong_p','weak_p','weak_np','strong_np','duration_s','duration_m','duration_l','size_s','size_m','size_l','modelsize','label','class','port','proto')]
names(feature_vectors_cleaned) = c("sp","wp","wnp","snp","ds","dm","dl","ss","sm","sl","modelsize","class","subclass","port","proto")
feature_vectors_cleaned$class = factor(feature_vectors_cleaned$class)
feature_vectors_cleaned$subclass = factor(feature_vectors_cleaned$subclass)
feature_vectors_cleaned$proto = factor(feature_vectors_cleaned$proto)

feature_vectors_cleaned

```

### Removing excessive Botnet and Normal classes (making the dataset more balanced)
```{r}
feature_vectors_cleaned.bkp <- feature_vectors_cleaned
feature_vectors_cleaned %>% group_by(class) %>% summarise(n=n()) %>% arrange(desc(n))
feature_vectors_cleaned_aux_botnet <- feature_vectors_cleaned %>% filter(class == 'Botnet-TCP-SMTP-Attempt-SPAM')
feature_vectors_cleaned_aux_normal <- feature_vectors_cleaned %>% filter(class == 'Normal-TCP-HTTP')
feature_vectors_cleaned_aux_botnet
feature_vectors_cleaned_aux_normal

feature_vectors_cleaned_aux_rest <- feature_vectors_cleaned %>% filter(class != 'Botnet-TCP-SMTP-Attempt-SPAM') %>% filter(class != 'Normal-TCP-HTTP')
feature_vectors_cleaned_aux_rest %>% group_by(class) %>% summarise(n=n()) %>% arrange(desc(n))
aux1 <- rbind(feature_vectors_cleaned_aux_botnet[1:500,],feature_vectors_cleaned_aux_normal[1:500,])
aux1 %>% group_by(class) %>% summarise(n=n()) %>% arrange(desc(n))
aux <- rbind(feature_vectors_cleaned_aux_rest,aux1)
aux %>% group_by(class) %>% summarise(n=n()) %>% arrange(desc(n))
feature_vectors_cleaned <- aux
```

### Create training set and testset
```{r}
set.seed(212)
trainIndex <- createDataPartition(feature_vectors_cleaned$subclass, p=0.70, list=FALSE)
data_training <- feature_vectors_cleaned[ trainIndex,]
data_testing <- feature_vectors_cleaned[-trainIndex,]

#data_train = data_train %>% filter(length>5)
train <- upSample(x = data_training,  y = data_training$subclass, yname="class")

training <- train[,-c(11,16)]
testing <- data_testing[,-c(11)]
training
testing

nrow(training)
nrow(feature_vectors_cleaned)

```

### Training configuration
```{r}
# Shared caret resampling configuration used by every train() call in this
# notebook: 10-fold cross-validation scored with twoClassSummary (which is what
# makes metric = "ROC" available), with class probabilities enabled.
# `repeats` was dropped: caret only honours it for method = "repeatedcv", so
# with plain "cv" it had no effect on the resampling and only raised a warning.
ctrl_fast <- trainControl(method = "cv",
                          number = 10,
                          summaryFunction = twoClassSummary,
                          verboseIter = TRUE,
                          classProbs = TRUE,
                          allowParallel = TRUE)
```

### Experiment 1
## Creation of cluster and k parameters analysis
```{r}
library(factoextra)
library(cluster)
library(NbClust)
feature_vector_training = training[,-c(11,12,13,14)]
# K-means clustering
set.seed(321)
#km.res <- kmeans(feature_vector_training, 3, nstart = 25)
km.res <- kmeans(feature_vector_training, 7, nstart = 25)
# k-means group number of each observation
km.res$cluster

# Visualize k-means clusters
fviz_cluster(km.res, data = feature_vector_training, geom = "point",
             stand = FALSE, ellipse.type = "norm")
```
### Elbow analysis
```{r}
set.seed(321)
# Compute and plot the total within-cluster sum of squares for k = 1 to k.max
k.max <- 15 # Maximal number of clusters
data <- feature_vector_training
# vapply() pins the result to one number per k, unlike sapply() whose return
# type depends on its input.
wss <- vapply(seq_len(k.max),
              function(k) kmeans(data, k, nstart = 10)$tot.withinss,
              numeric(1))
plot(seq_len(k.max), wss,
     type = "b", pch = 19, frame.plot = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
# Dashed marker at the k chosen for the rest of the notebook.
abline(v = 3, lty = 2)
```
## Silhouette analysis
```{r}
set.seed(322)
k.max <- 10
data <- feature_vector_training
nrow(data)
sil <- rep(0, k.max)
# The pairwise distance matrix does not depend on k, so it is computed once
# instead of once per loop iteration.
pairwise_dist <- dist(data)
# Compute the average silhouette width for k = 2 to k = k.max
# (sil[1] keeps its initial 0 because the loop starts at k = 2).
for (i in 2:k.max) {
  km.res <- kmeans(data, centers = i, nstart = 25)
  ss <- silhouette(km.res$cluster, pairwise_dist)
  # Third column of the silhouette object is the per-observation width.
  sil[i] <- mean(ss[, 3])
}
# Plot the average silhouette width and mark the best-scoring k
plot(seq_len(k.max), sil, type = "b", pch = 19,
     frame.plot = FALSE, xlab = "Number of clusters k")
abline(v = which.max(sil), lty = 2)


```
### Useful functions
```{r}
# Cold-start study for the cluster-ensemble model.
#
# For every training prefix of 200, 400, ... rows of `training.sampled`:
#   1. split the prefix into 3 k-means clusters (columns 11-14 are excluded
#      from the clustering input),
#   2. fit one random forest per cluster,
#   3. let each forest cast a +1 (botnet) / -1 (normal) vote per row, and
#   4. score the majority vote with F1 on `testing` and on the prefix itself.
#
# Args:
#   training.sampled: shuffled training data frame with the sp..sl feature
#     columns and a `subclass` label column.
#   testing: held-out data frame with the same columns.
#   settings: caret trainControl object passed to train().
#
# Returns a list with:
#   output         - data frame (data_count, metric): F1 on `testing`.
#   output_t       - data frame (data_count, metric): F1 on the training prefix.
#   testing_result - data frame with a placeholder first column followed by one
#                    'result' column of summed votes per training size.
#
# The function no longer starts its own 2-worker cluster: that cluster was
# never stopped (one leaked set of workers per call) and it silently replaced
# the foreach backend registered by the notebook. train() now runs on
# whichever backend is already registered.
cold_start_data <- function(training.sampled, testing, settings) {
  step_size <- 200
  # Only complete 200-row steps are evaluated. floor() makes explicit the
  # truncation that `1:(n / 200)` did implicitly, and seq_len() yields no steps
  # (instead of c(1, 0)) when fewer than 200 rows are supplied.
  n_steps <- floor(nrow(training.sampled) / step_size)
  data_count <- step_size * seq_len(n_steps)

  testing_result <- data.frame(numeric(nrow(testing)))
  testing_reference <- as.factor(testing$subclass)
  metric <- numeric(n_steps)
  metric_t <- numeric(n_steps)

  for (i in seq_len(n_steps)) {
    aux_training_set <- training.sampled[seq_len(data_count[i]), ]
    training_reference <- as.factor(aux_training_set$subclass)
    clusters <- kmeans(aux_training_set[, -c(11, 12, 13, 14)], 3, nstart = 25)
    aux_training_set_cluster <- cbind(aux_training_set, cluster = clusters$cluster)
    result_vector <- numeric(nrow(testing))
    result_vector_trainning <- numeric(nrow(aux_training_set))

    for (j in 1:3) {
      cluster_data <- dplyr::filter(aux_training_set_cluster, cluster == j)
      new_rfFit <- train(subclass ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
                         data = cluster_data,
                         metric = "ROC",
                         method = "rf",
                         trControl = settings)

      # Testing predict: each forest adds +1 for botnet, -1 otherwise.
      predsrfprobs <- predict(new_rfFit, testing, type = 'prob')
      result_vector <- result_vector +
        ifelse(predsrfprobs$botnet > 0.5, 1, -1)

      # Training predict: same vote on the rows the forests were fitted from.
      predsrfprobs_t <- predict(new_rfFit, aux_training_set, type = 'prob')
      result_vector_trainning <- result_vector_trainning +
        ifelse(predsrfprobs_t$botnet > 0.5, 1, -1)
    }

    # Positive vote sum means the majority of forests said botnet.
    # confusionMatrix() needs factors sharing the reference's levels.
    a <- factor(ifelse(result_vector > 0, 'botnet', 'normal'),
                levels = levels(testing_reference))
    b <- factor(ifelse(result_vector_trainning > 0, 'botnet', 'normal'),
                levels = levels(training_reference))
    testing_result <- cbind(testing_result, 'result' = result_vector)

    cm <- confusionMatrix(a, testing_reference)
    metric[i] <- cm$byClass['F1']

    cm_t <- confusionMatrix(b, training_reference)
    metric_t[i] <- cm_t$byClass['F1']
  }

  output <- data.frame(data_count = data_count, metric = metric)
  output_t <- data.frame(data_count = data_count, metric = metric_t)
  list('output' = output, 'output_t' = output_t, 'testing_result' = testing_result)
}

# Cold-start study for a single random forest (no clustering).
#
# For every training prefix of 200, 400, ... rows of `training.sampled`, fits
# one random forest and records its F1 on `testing` and on the prefix itself.
# A row is labelled botnet when its predicted botnet probability is >= 0.5.
#
# Args:
#   training.sampled: shuffled training data frame with the sp..sl feature
#     columns and a `subclass` label column.
#   testing: held-out data frame with the same columns.
#   settings: caret trainControl object passed to train().
#
# Returns a list with:
#   output         - data frame (data_count, metric): F1 on `testing`.
#   output_t       - data frame (data_count, metric): F1 on the training prefix.
#   testing_result - one-column placeholder data frame of zeros, one row per
#                    testing row (this variant records no votes).
cold_start_data_only_rf <- function(training.sampled, testing, settings) {
  step_size <- 200
  # Only complete 200-row steps are evaluated. floor() makes explicit the
  # truncation that `1:(n / 200)` did implicitly, and seq_len() yields no steps
  # (instead of c(1, 0)) when fewer than 200 rows are supplied.
  n_steps <- floor(nrow(training.sampled) / step_size)
  # Plain arithmetic replaces the parallel foreach that only computed 200 * i.
  data_count <- step_size * seq_len(n_steps)

  testing_result <- data.frame(numeric(nrow(testing)))
  testing_reference <- as.factor(testing$subclass)
  metric <- numeric(n_steps)
  metric_t <- numeric(n_steps)

  for (i in seq_len(n_steps)) {
    aux_training_set <- training.sampled[seq_len(data_count[i]), ]
    training_reference <- as.factor(aux_training_set$subclass)
    new_rfFit <- train(subclass ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
                       data = aux_training_set,
                       metric = "ROC",
                       method = "rf",
                       trControl = settings)

    # Testing predict. confusionMatrix() needs factors sharing the reference's
    # levels, so the character labels are converted explicitly.
    predsrfprobs <- predict(new_rfFit, testing, type = 'prob')
    predsrf <- factor(ifelse(predsrfprobs$botnet >= 0.5, 'botnet', 'normal'),
                      levels = levels(testing_reference))
    cm <- confusionMatrix(predsrf, testing_reference)
    metric[i] <- cm$byClass['F1']

    # Training predict: same scoring on the rows the model was fitted from.
    predsrfprobs_t <- predict(new_rfFit, aux_training_set, type = 'prob')
    predsrf_t <- factor(ifelse(predsrfprobs_t$botnet >= 0.5, 'botnet', 'normal'),
                        levels = levels(training_reference))
    cm_t <- confusionMatrix(predsrf_t, training_reference)
    metric_t[i] <- cm_t$byClass['F1']
  }

  output <- data.frame(data_count = data_count, metric = metric)
  output_t <- data.frame(data_count = data_count, metric = metric_t)
  list('output' = output, 'output_t' = output_t, 'testing_result' = testing_result)
}

# Inject label noise into a dataset by flipping the `class` of a random sample.
#
# Args:
#   dataset: data frame with a `class` column holding 'Botnet' / 'Normal'.
#   porcent: number of rows (a count, not a percentage) to draw for flipping.
#
# Returns the shuffled dataset in which the drawn 'Botnet' rows are relabelled
# 'Normal' and the drawn 'Normal' rows are relabelled 'Botnet'.
#
# NOTE(review): drawn rows whose class is neither 'Botnet' nor 'Normal' are
# dropped from the result, as in the original implementation - confirm that
# this is intended for datasets with other labels.
generate_data_noisy <- function(dataset, porcent) {
  flip_idx <- sample(nrow(dataset), porcent)
  # Select the untouched rows with a logical mask: `dataset[-flip_idx, ]`
  # returns ZERO rows when flip_idx is empty, which discarded the whole
  # dataset whenever `porcent` was 0.
  is_drawn <- seq_len(nrow(dataset)) %in% flip_idx
  no_noisy_data_sample <- dataset[!is_drawn, , drop = FALSE]
  noisy_data_sample <- dataset[flip_idx, , drop = FALSE]

  drawn_labels <- as.character(noisy_data_sample$class)
  # which() ignores NA labels, so rows with a missing class are never selected.
  noisy_data_sample_b <- noisy_data_sample[which(drawn_labels == 'Botnet'), , drop = FALSE]
  noisy_data_sample_n <- noisy_data_sample[which(drawn_labels == 'Normal'), , drop = FALSE]

  # Swap the labels of the drawn rows.
  noisy_data_sample_b$class <- factor(rep('Normal', nrow(noisy_data_sample_b)))
  noisy_data_sample_n$class <- factor(rep('Botnet', nrow(noisy_data_sample_n)))

  noisy_data <- rbind(noisy_data_sample_b, noisy_data_sample_n)
  training_noisy <- rbind(no_noisy_data_sample, noisy_data)
  # Shuffle so the flipped rows are not grouped at the end.
  training_noisy[sample(nrow(training_noisy), nrow(training_noisy)), ]
}

# ELA robustness measure for a classifier trained on noisy data.
# (Presumably "Equalized Loss of Accuracy" - confirm the acronym.)
#
# Args:
#   A0: baseline score obtained without noise.
#   Ax: score obtained at the noise level under study.
#
# Returns |A0 - Ax| / A0 + (1 - A0) / A0, vectorised over both arguments.
get_ELA_measure <- function(A0, Ax) {
  relative_loss <- abs(A0 - Ax) / A0
  baseline_shortfall <- (1 - A0) / A0
  relative_loss + baseline_shortfall
}

# Fit a random forest on `training_data` and report its per-class statistics
# on `testing_data`.
#
# Args:
#   training_data: data frame with the sp..sl feature columns and a `class`
#     label column with 'Botnet' / 'Normal' values.
#   testing_data: data frame with the same columns.
#   settings: caret trainControl object. The original body read `settings` as
#     an undefined free variable; it is now an explicit argument that defaults
#     to the notebook's `ctrl_fast`, resolved lazily at call time.
#     NOTE(review): if a global `settings` object was relied upon, pass it
#     explicitly.
#
# Returns the `byClass` vector of caret's confusionMatrix (Sensitivity,
# Specificity, F1, ...).
randomForest_performace <- function(training_data, testing_data, settings = ctrl_fast) {
  rfFit <- train(class ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
                 data = training_data,
                 metric = "ROC",
                 method = "rf",
                 trControl = settings)
  predsrfprobs <- predict(rfFit, testing_data, type = 'prob')
  reference <- as.factor(testing_data$class)
  # confusionMatrix() needs factors sharing the reference's levels.
  predsrf <- factor(ifelse(predsrfprobs$Botnet >= 0.5, 'Botnet', 'Normal'),
                    levels = levels(reference))
  cm <- confusionMatrix(predsrf, reference)
  cm$byClass
}

# Cosine similarity between one query vector and every row of a table.
#
# Args:
#   vector: the query; coerced with as.numeric().
#   matrix: matrix or data frame with one candidate per row and as many
#     columns as `vector` has elements; values are coerced to numeric.
#
# Returns an unnamed numeric vector with one similarity per row (numeric(0)
# for a zero-row input, which previously failed because `1:nrow()` iterated
# over c(1, 0)). A zero-length query or row yields NaN, as before.
cosine_similarity <- function(vector, matrix) {
  v1 <- as.numeric(vector)
  rows <- as.matrix(matrix)
  storage.mode(rows) <- "double"
  if (nrow(rows) == 0) {
    return(numeric(0))
  }
  # One matrix product replaces the per-row loop: dot products in the
  # numerator, product of the squared norms under the root in the denominator.
  dot_products <- as.vector(rows %*% v1)
  squared_norms <- sum(v1 * v1) * rowSums(rows * rows)
  unname(dot_products / sqrt(squared_norms))
}

# Predict a class by majority vote among the training rows most similar to a
# query vector.
#
# Args:
#   train_element: training data frame whose first 10 columns are the numeric
#     features and which has a `class` column.
#   prediction_element: the query row as a vector; only its first 10 elements
#     are used.
#   number_element: how many of the most similar training rows take part in
#     the vote.
#
# Returns the top-left cell of the vote table (`aux[1, 1]`): the most frequent
# class among the selected rows, in whatever container dplyr returns for that
# subset (callers unlist() it).
prediction_by_similarity <- function(train_element, prediction_element, number_element) {
  numeric_train_element <- train_element[, 1:10] # Numeric feature columns only
  similarity_result <- cosine_similarity(prediction_element[1:10], numeric_train_element)
  train_element$similarity_result <- similarity_result
  train_element_order_by_similarity <-
    train_element[order(train_element$similarity_result, decreasing = TRUE), ]
  # Never ask for more rows than exist: indexing past the end pads the
  # selection with all-NA rows, which then voted as an NA class.
  n_selected <- min(number_element, nrow(train_element_order_by_similarity))
  train_element_order_by_similarity <-
    train_element_order_by_similarity[seq_len(n_selected), ]
  aux <- train_element_order_by_similarity %>%
    group_by(class) %>%
    summarise(n = n()) %>%
    arrange(desc(n))
  aux[1, 1]
}

training
testing
```

### Data training partitions: cold start study
### Iteration #1
```{r}
set.seed(201)
size_training <- nrow(training)
training.sampled_1 <- training[sample(size_training, size_training), ]

result <- cold_start_data(training.sampled_1, testing, settings = ctrl_fast)
output_1 <- result$output
output_t_1 <- result$output_t
output_1

gg <- ggplot(data = output_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)
first_training_sample <- training.sampled_1[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

testing_result <- result$testing_result
#testing_result

gg <- ggplot(data = output_t_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

output_t_aux_1 <- output_t_1
names(output_t_aux_1) <- c('data_count_t','metric_t')
output_result_1 <- cbind(output_1,output_t_aux_1)
gg <- ggplot(data = output_result_1)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)
```
### Iteration #2
```{r}
set.seed(202)
size_training <- nrow(training)
training.sampled_2 <- training[sample(size_training, size_training), ]

result_2 <- cold_start_data(training.sampled_2, testing, settings = ctrl_fast)
output_2 <- result_2$output
output_t_2 <- result_2$output_t
output_2

gg <- ggplot(data = output_2)
  gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
    labs(title="Random Forest through data training size", 
         #subtitle="Drawn from Long Data format", 
         caption="Source: CTU-13", 
         y="F1 Score", 
         color=NULL)
first_training_sample <- training.sampled_2[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Keep this iteration's own vote table: `result` belongs to iteration #1, the
# run for this seed is stored in `result_2`.
testing_result_2 <- result_2$testing_result
#testing_result

gg <- ggplot(data = output_t_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

output_t_aux_2 <- output_t_2
names(output_t_aux_2) <- c('data_count_t','metric_t')
output_result_2 <- cbind(output_2,output_t_aux_2)
gg <- ggplot(data = output_result_2)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)
```

### Iteration #3
```{r}
set.seed(233)
size_training <- nrow(training)
training.sampled_3 <- training[sample(size_training, size_training), ]

result_3 <- cold_start_data(training.sampled_3, testing, settings = ctrl_fast)
output_3 <- result_3$output
output_t_3 <- result_3$output_t
output_3


gg <- ggplot(data = output_3)
  gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
    labs(title="Random Forest through data training size", 
         #subtitle="Drawn from Long Data format", 
         caption="Source: CTU-13", 
         y="F1 Score", 
         color=NULL)
first_training_sample <- training.sampled_3[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Keep this iteration's own vote table: `result` belongs to iteration #1, the
# run for this seed is stored in `result_3`.
testing_result_3 <- result_3$testing_result
#testing_result

gg <- ggplot(data = output_t_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

output_t_aux_3 <- output_t_3
names(output_t_aux_3) <- c('data_count_t','metric_t')
output_result_3 <- cbind(output_3,output_t_aux_3)
gg <- ggplot(data = output_result_3)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)
```

### Iteration #4
```{r}
set.seed(204)
size_training <- nrow(training)
training.sampled_4 <- training[sample(size_training, size_training), ]

result_4 <- cold_start_data(training.sampled_4, testing, settings = ctrl_fast)
output_4 <- result_4$output
output_t_4 <- result_4$output_t
output_4

gg <- ggplot(data = output_4)
  gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
    labs(title="Random Forest through data training size", 
         #subtitle="Drawn from Long Data format", 
         caption="Source: CTU-13", 
         y="F1 Score", 
         color=NULL)
first_training_sample <- training.sampled_4[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Keep this iteration's own vote table: `result` belongs to iteration #1, the
# run for this seed is stored in `result_4`.
testing_result_4 <- result_4$testing_result
#testing_result

gg <- ggplot(data = output_t_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

output_t_aux_4 <- output_t_4
names(output_t_aux_4) <- c('data_count_t','metric_t')
output_result_4 <- cbind(output_4,output_t_aux_4)
gg <- ggplot(data = output_result_4)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)
```

### Iteration #5
```{r}
set.seed(205)
size_training <- nrow(training)
training.sampled_5 <- training[sample(size_training, size_training), ]

result_5 <- cold_start_data(training.sampled_5, testing, settings = ctrl_fast)
output_5 <- result_5$output
output_t_5 <- result_5$output_t
output_5

gg <- ggplot(data = output_5)
  gg + geom_line(aes(x = data_count, y = metric),color = 'red') + 
    labs(title="Random Forest through data training size", 
         #subtitle="Drawn from Long Data format", 
         caption="Source: CTU-13", 
         y="F1 Score", 
         color=NULL)
first_training_sample <- training.sampled_5[1:200,]
ggplot(first_training_sample) + geom_bar(aes(subclass))
class_distribution <- first_training_sample %>% group_by(class) %>% summarise(n = n()) %>% arrange(desc(n))
ggplot(first_training_sample) + geom_bar(aes(class)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Keep this iteration's own vote table: `result` belongs to iteration #1, the
# run for this seed is stored in `result_5`.
testing_result_5 <- result_5$testing_result
#testing_result

gg <- ggplot(data = output_t_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)

output_t_aux_5 <- output_t_5
names(output_t_aux_5) <- c('data_count_t','metric_t')
output_result_5 <- cbind(output_5,output_t_aux_5)
gg <- ggplot(data = output_result_5)
gg + geom_line(aes(x = data_count, y = metric),color = 'red') + geom_line(aes(x = data_count_t, y = metric_t),color = 'blue') + 
  labs(title="Random Forest through data training size", 
       #subtitle="Drawn from Long Data format", 
       caption="Source: CTU-13", 
       y="F1 Score", 
       color=NULL)
```

### Data training partitions: cold start study (simple Random Forest)
```{r}
set.seed(211)
size_training <- nrow(training)
rf_training.sampled_1 <- training[sample(size_training, size_training), ]
rf_result_1 <- cold_start_data_only_rf(rf_training.sampled_1, testing, settings = ctrl_fast)

set.seed(222)
size_training <- nrow(training)
rf_training.sampled_2 <- training[sample(size_training, size_training), ]
rf_result_2 <- cold_start_data_only_rf(rf_training.sampled_2, testing, settings = ctrl_fast)

set.seed(223)
size_training <- nrow(training)
rf_training.sampled_3 <- training[sample(size_training, size_training), ]
rf_result_3 <- cold_start_data_only_rf(rf_training.sampled_3, testing, settings = ctrl_fast)

set.seed(224)
size_training <- nrow(training)
rf_training.sampled_4 <- training[sample(size_training, size_training), ]
rf_result_4 <- cold_start_data_only_rf(rf_training.sampled_4, testing, settings = ctrl_fast)

set.seed(225)
size_training <- nrow(training)
rf_training.sampled_5 <- training[sample(size_training, size_training), ]
rf_result_5 <- cold_start_data_only_rf(rf_training.sampled_5, testing, settings = ctrl_fast)

rf_data.result <- data.frame(rf_result_5$output$data_count)
rf_data.result_t <- data.frame(rf_result_5$output$data_count)
for(i in c(1:30)){
  current_seed <- 226 + i
  set.seed(current_seed)
  #size_training <- nrow(training)
  rf_training.sampled_current <- training[sample(size_training, size_training), ]
  rf_result_current <- cold_start_data_only_rf(rf_training.sampled_current, testing, settings = ctrl_fast)
  
  rf_data.result <- cbind(rf_data.result,rf_result_current$output$metric)
  rf_data.result_t <- cbind(rf_data.result_t,rf_result_current$output_t$metric)
}
 
 # Column labels for the result tables: the training-size column followed by
 # one label per seeded run (iteration_1 ... iteration_30).
 x <- c('count_of_data', paste0('iteration_', seq_len(30)))
 x
 names(rf_data.result) <- x
 names(rf_data.result_t) <- x
 rf_data.result
 rf_data.result_t 
 
 #write.table(rf_data.result,file="random_forest_30_iterations_f1_testing.txt",sep="|", row.names = F)
 #write.table(rf_data.result_t,file="random_forest_30_iterations_f1_training.txt",sep="|", row.names = F)
```

### Iteration #1
```{r}
# Learning curves of shuffled run #1. Two curves are taken from the run:
# `output` (drawn in red) and `output_t` (drawn in blue).
rf_output_1 <- rf_result_1[["output"]]
rf_output_t_1 <- rf_result_1[["output_t"]]
rf_output_1

# Curve stored in `output`.
gg <- ggplot(data = rf_output_1)
gg +
  geom_line(aes(x = data_count, y = metric), color = "red") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )

# Label make-up of the first 200 rows of this run's shuffled training set.
rf_first_training_sample <- rf_training.sampled_1[seq_len(200), ]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))
rf_class_distribution <- rf_first_training_sample %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
ggplot(rf_first_training_sample) +
  geom_bar(aes(class)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Curve stored in `output_t`.
gg <- ggplot(data = rf_output_t_1)
gg +
  geom_line(aes(x = data_count, y = metric), color = "blue") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )

# Both curves on one plot: the `output_t` columns are renamed so they can sit
# next to the `output` columns in a single data frame.
rf_output_t_aux_1 <- setNames(rf_output_t_1, c("data_count_t", "metric_t"))
rf_output_result_1 <- cbind(rf_output_1, rf_output_t_aux_1)
gg <- ggplot(data = rf_output_result_1)
gg +
  geom_line(aes(x = data_count, y = metric), color = "red") +
  geom_line(aes(x = data_count_t, y = metric_t), color = "blue") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )
```

### Iteration #2
```{r}
# Learning curves of shuffled run #2. Two curves are taken from the run:
# `output` (drawn in red) and `output_t` (drawn in blue).
rf_output_2 <- rf_result_2[["output"]]
rf_output_t_2 <- rf_result_2[["output_t"]]
rf_output_2

# Curve stored in `output`.
gg <- ggplot(data = rf_output_2)
gg +
  geom_line(aes(x = data_count, y = metric), color = "red") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )

# Label make-up of the first 200 rows of this run's shuffled training set.
rf_first_training_sample <- rf_training.sampled_2[seq_len(200), ]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))
rf_class_distribution <- rf_first_training_sample %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
ggplot(rf_first_training_sample) +
  geom_bar(aes(class)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Curve stored in `output_t`.
gg <- ggplot(data = rf_output_t_2)
gg +
  geom_line(aes(x = data_count, y = metric), color = "blue") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )

# Both curves on one plot: the `output_t` columns are renamed so they can sit
# next to the `output` columns in a single data frame.
rf_output_t_aux_2 <- setNames(rf_output_t_2, c("data_count_t", "metric_t"))
rf_output_result_2 <- cbind(rf_output_2, rf_output_t_aux_2)
gg <- ggplot(data = rf_output_result_2)
gg +
  geom_line(aes(x = data_count, y = metric), color = "red") +
  geom_line(aes(x = data_count_t, y = metric_t), color = "blue") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )
```

### Iteration #3
```{r}
# Learning curves of shuffled run #3. Two curves are taken from the run:
# `output` (drawn in red) and `output_t` (drawn in blue).
rf_output_3 <- rf_result_3[["output"]]
rf_output_t_3 <- rf_result_3[["output_t"]]
rf_output_3

# Curve stored in `output`.
gg <- ggplot(data = rf_output_3)
gg +
  geom_line(aes(x = data_count, y = metric), color = "red") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )

# Label make-up of the first 200 rows of this run's shuffled training set.
rf_first_training_sample <- rf_training.sampled_3[seq_len(200), ]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))
rf_class_distribution <- rf_first_training_sample %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
ggplot(rf_first_training_sample) +
  geom_bar(aes(class)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Curve stored in `output_t`.
gg <- ggplot(data = rf_output_t_3)
gg +
  geom_line(aes(x = data_count, y = metric), color = "blue") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )

# Both curves on one plot: the `output_t` columns are renamed so they can sit
# next to the `output` columns in a single data frame.
rf_output_t_aux_3 <- setNames(rf_output_t_3, c("data_count_t", "metric_t"))
rf_output_result_3 <- cbind(rf_output_3, rf_output_t_aux_3)
gg <- ggplot(data = rf_output_result_3)
gg +
  geom_line(aes(x = data_count, y = metric), color = "red") +
  geom_line(aes(x = data_count_t, y = metric_t), color = "blue") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )
```

### Iteration #4
```{r}
# Learning curves of shuffled run #4. Two curves are taken from the run:
# `output` (drawn in red) and `output_t` (drawn in blue).
rf_output_4 <- rf_result_4[["output"]]
rf_output_t_4 <- rf_result_4[["output_t"]]
rf_output_4

# Curve stored in `output`.
gg <- ggplot(data = rf_output_4)
gg +
  geom_line(aes(x = data_count, y = metric), color = "red") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )

# Label make-up of the first 200 rows of this run's shuffled training set.
rf_first_training_sample <- rf_training.sampled_4[seq_len(200), ]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))
rf_class_distribution <- rf_first_training_sample %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
ggplot(rf_first_training_sample) +
  geom_bar(aes(class)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Curve stored in `output_t`.
gg <- ggplot(data = rf_output_t_4)
gg +
  geom_line(aes(x = data_count, y = metric), color = "blue") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )

# Both curves on one plot: the `output_t` columns are renamed so they can sit
# next to the `output` columns in a single data frame.
rf_output_t_aux_4 <- setNames(rf_output_t_4, c("data_count_t", "metric_t"))
rf_output_result_4 <- cbind(rf_output_4, rf_output_t_aux_4)
gg <- ggplot(data = rf_output_result_4)
gg +
  geom_line(aes(x = data_count, y = metric), color = "red") +
  geom_line(aes(x = data_count_t, y = metric_t), color = "blue") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )
```

### Iteration #5
```{r}
# Learning curves of shuffled run #5. Two curves are taken from the run:
# `output` (drawn in red) and `output_t` (drawn in blue).
rf_output_5 <- rf_result_5[["output"]]
rf_output_t_5 <- rf_result_5[["output_t"]]
rf_output_5

# Curve stored in `output`.
gg <- ggplot(data = rf_output_5)
gg +
  geom_line(aes(x = data_count, y = metric), color = "red") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )

# Label make-up of the first 200 rows of this run's shuffled training set.
rf_first_training_sample <- rf_training.sampled_5[seq_len(200), ]
ggplot(rf_first_training_sample) + geom_bar(aes(subclass))
rf_class_distribution <- rf_first_training_sample %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
ggplot(rf_first_training_sample) +
  geom_bar(aes(class)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Curve stored in `output_t`.
gg <- ggplot(data = rf_output_t_5)
gg +
  geom_line(aes(x = data_count, y = metric), color = "blue") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )

# Both curves on one plot: the `output_t` columns are renamed so they can sit
# next to the `output` columns in a single data frame.
rf_output_t_aux_5 <- setNames(rf_output_t_5, c("data_count_t", "metric_t"))
rf_output_result_5 <- cbind(rf_output_5, rf_output_t_aux_5)
gg <- ggplot(data = rf_output_result_5)
gg +
  geom_line(aes(x = data_count, y = metric), color = "red") +
  geom_line(aes(x = data_count_t, y = metric_t), color = "blue") +
  labs(
    title = "Random Forest through data training size",
    caption = "Source: CTU-13",
    y = "F1 Score",
    color = NULL
  )
```


### Studies Samples
```{r}
# Inspect the first 200 rows of the current shuffled training set and plot
# how its `subclass` and `class` labels are distributed.
first_training_sample <- training.sampled[seq_len(200), ]
first_training_sample
ggplot(first_training_sample) +
  geom_bar(aes(subclass))

# Row count per class, largest first (kept for inspection; not printed here).
class_distribution <- first_training_sample %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
ggplot(first_training_sample) +
  geom_bar(aes(class)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

```


### part 2
```{r}
# Learning curve for a "cluster, then classify" ensemble.
# For each training size (200, 400, ...) the first `count` rows of one shuffled
# training set are split into 3 k-means clusters, one random forest is trained
# per cluster, and the three forests vote on every testing row. The F1 score of
# the voted prediction is recorded for each size.
set.seed(206)
library(doParallel)
# NOTE(review): this cluster is never stopped in this chunk and it stays
# registered as the foreach backend afterwards. Confirm that is intended
# before adding a stopCluster(cl), since later train() calls may rely on it.
cl <- makeCluster(2)
registerDoParallel(cl)
size_training <- nrow(training)
split_size_training <- size_training / 200
# Training sizes, one list element per step (x axis of the curve).
count_random <- foreach(i = 1:split_size_training) %dopar% {
  200 * i
}
training.sampled <- training[sample(size_training, size_training), ]
metric <- foreach(i = 1:split_size_training) %do% {
  count <- 200 * i
  aux_training_set <- training.sampled[c(1:count), ]
  # Columns 11-14 are left out of the clustering input.
  clusters <- kmeans(aux_training_set[, -c(11, 12, 13, 14)], 3, nstart = 25)
  aux_training_set_cluster <- cbind(aux_training_set, cluster = clusters$cluster)
  # Running vote per testing row: +1 for a Botnet vote, -1 otherwise.
  result_vector <- numeric(nrow(testing))
  for (j in c(1:3)) {
    cluster_data <- filter(aux_training_set_cluster, cluster == j)
    new_rfFit <- train(class ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
               data = cluster_data,
               metric="ROC",
               method = "rf",
               trControl = ctrl_fast)
    predsrfprobs <- predict(new_rfFit, testing, type = 'prob')
    # Vectorised vote; replaces the row-by-row if/else loop.
    result_vector <- result_vector + ifelse(predsrfprobs$Botnet > 0.5, 1, -1)
  }
  # Three +/-1 votes can never sum to zero, so the sign is the majority.
  a <- ifelse(result_vector > 0, 'Botnet', 'Normal')
  cm <- confusionMatrix(a, testing$class)
  metric <- cm$byClass['F1']
  metric
}

output <- do.call(rbind, Map(data.frame, data_count = count_random, metric = metric))
output
gg <- ggplot(data = output)
# The plotted metric is F1 (cm$byClass['F1']), so the axis is labelled as such.
gg + geom_line(aes(x = data_count, y = metric), color = 'red') +
  labs(title="Random Forest through data training size",
       #subtitle="Drawn from Long Data format",
       caption="Source: CTU-13",
       y="F1 Score",
       color=NULL)
# Rows of the last cluster processed in the final iteration.
cluster_data
```


### Test with only one
```{r}
# Single-size check of the "cluster, then classify" ensemble: the first 200
# rows of a shuffled training set are split into 3 k-means clusters, one random
# forest is trained per cluster, and the three forests vote on both the testing
# rows and the 200 training rows. F1 is reported for each.
set.seed(226)
size_training <- nrow(training)
training.sampled <- training[sample(size_training, size_training), ]

aux_training_set <- training.sampled[c(1:200), ]
# Columns 11-14 are left out of the clustering input.
clusters <- kmeans(aux_training_set[, -c(11, 12, 13, 14)], 3, nstart = 25)
aux_training_set_cluster <- cbind(aux_training_set, cluster = clusters$cluster)
# Running votes per row: +1 for a botnet vote, -1 otherwise.
result_vector <- numeric(nrow(testing))
result_vector_trainning <- numeric(nrow(aux_training_set))

for (j in seq_len(3)) {
  cluster_data <- aux_training_set_cluster %>% filter(cluster == j)
  new_rfFit <- train(subclass ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
               data = cluster_data,
               metric="ROC",
               method = "rf",
               trControl = ctrl_fast)

  # Votes on the testing rows (vectorised instead of a 1:length() loop).
  predsrfprobs <- predict(new_rfFit, testing, type = 'prob')
  result_vector <- result_vector + ifelse(predsrfprobs$botnet > 0.5, 1, -1)

  # Votes on the 200 training rows the clusters were built from.
  predsrfprobs_t <- predict(new_rfFit, aux_training_set, type = 'prob')
  result_vector_trainning <- result_vector_trainning +
    ifelse(predsrfprobs_t$botnet > 0.5, 1, -1)
}

# Three +/-1 votes can never sum to zero, so the sign is the majority.
a <- ifelse(result_vector > 0, 'botnet', 'normal')
b <- ifelse(result_vector_trainning > 0, 'botnet', 'normal')
cm <- confusionMatrix(a, testing$subclass)
metric <- cm$byClass['F1']
metric
cm_t <- confusionMatrix(b, aux_training_set$subclass)
metric_t <- cm_t$byClass['F1']
metric_t
```

### Sample examples
```{r}
# Scratch example: two consecutive draws of 3 positions out of 9 under one
# seed, each used to index the same vector.
set.seed(556)
a <- as.numeric(seq_len(9))
r <- sample.int(9, 3)
a[r]
r2 <- sample.int(9, 3)
a[r2]
```

```{r}
# Analysis of `testing_result`, the per-training-size votes for each testing
# row. `testing_result` and `feature_vectors_cleaned` are not assigned in this
# chunk and must already exist in the session.
testing_result.bkp <- testing_result
testing_result
# One column label per training size: size_200, size_400, ...
names_aux <- foreach(i = 1:(nrow(training) / 200)) %do% {
  iteration <- 200 * i
  paste0('size_', iteration)
}
testing_result_names <- unlist(names_aux, use.names = FALSE)
# Drop the first column, then label the remaining ones by training size.
testing_result <- testing_result[, c(-1)]
names(testing_result) <- testing_result_names
testing_result

testing_aux <- cbind(testing, testing_result)
testing_aux.bkp2 <- testing_aux
#write.table(testing_aux,file="testing_cluster_result.txt",sep="|", row.names = F)
testing_aux
# Row-wise total of every column after the first 14.
sums <- rowSums(testing_aux[, -c(1:14)])
sums
testing_aux[, -c(1:14)]
testing_aux <- cbind(testing_aux, sums)
testing_aux
testing_aux_result <- testing_aux %>%
  group_by(class) %>%
  summarise(n = n(), sums = sum(sums)) %>%
  arrange(desc(sums))
testing_aux_result

# The first and last rows of the ranking are left out of the plot.
graph_testing_result <- ggplot(testing_aux_result[-c(1, nrow(testing_aux_result)), ])
graph_testing_result + geom_point(aes(class, sums)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

feature_vectors_cleaned

# Export the first 20 feature vectors as a table in a PDF.
library(gridExtra)
pdf("data_output.pdf", height = 11, width = 8.5)
grid.table(feature_vectors_cleaned[1:20, ])
dev.off()

testing_result.bkp
testing_aux.bkp2
testing_aux_result

# Per-row counts of positive and negative votes across all training sizes.
rusty_data_result <- testing_aux.bkp2
rusty_data_result_short <- rusty_data_result[, -c(1:11, 14)]
rusty_data_result_short[, -c(1, 2)]
# Take the vote columns once so `neg` is not computed over a frame that
# already contains `pos`.
vote_columns <- rusty_data_result_short[, -c(1, 2)]
rusty_data_result_short$pos <- rowSums(vote_columns > 0)
rusty_data_result_short$neg <- rowSums(vote_columns < 0)
# Keep the two leading columns plus pos/neg, located by name instead of the
# hard-coded positions 46 and 47.
rusty_data_result_short_cleaned <- rusty_data_result_short[
  , c(1, 2, match(c("pos", "neg"), names(rusty_data_result_short)))
]
rusty_data_result_short_cleaned
# A vote is "good" when it agrees with the row's subclass: negative votes for
# normal rows, positive votes otherwise. "bad" is the opposite.
rusty_data_result_short_cleaned_result <- rusty_data_result_short_cleaned %>%
  mutate(good = ifelse(subclass == 'normal', neg, pos))
rusty_data_result_short_cleaned_result <- rusty_data_result_short_cleaned_result %>%
  mutate(bad = ifelse(subclass == 'normal', pos, neg))
rusty_data_result_short_cleaned_result %>%
  group_by(port) %>%
  summarise(n = n(), good = sum(good), bad = sum(bad)) %>%
  arrange(desc(n))

# Same summary, split by subclass.
data_botnet_port <- rusty_data_result_short_cleaned_result %>% filter(subclass == 'botnet')
data_normal_port <- rusty_data_result_short_cleaned_result %>% filter(subclass == 'normal')
data_botnet_port_result <- data_botnet_port %>%
  group_by(port) %>%
  summarise(n = n(), good = sum(good), bad = sum(bad)) %>%
  arrange(desc(n))
data_normal_port_result <- data_normal_port %>%
  group_by(port) %>%
  summarise(n = n(), good = sum(good), bad = sum(bad)) %>%
  arrange(desc(n))
data_botnet_port_result
data_normal_port_result

# Number of botnet rows per port. The previous version mapped `fill` to
# `clarity`, which is not a column of this summary (port, n, good, bad).
ggplot(data = data_botnet_port_result) +
  geom_col(mapping = aes(x = as.factor(port), y = n)) +
  labs(x = "port")

#write.table(data_botnet_port_result,file="data_botnet_port.txt",sep="|", row.names = F)
library(reshape2)
data <- data_botnet_port_result
data$port <- as.factor(data$port)

# Long format (port, variable, value) for the good/bad columns.
melt(data[, c(1, 3, 4)], id.vars = "port")

# Stacked good/bad vote totals per port for botnet rows.
ggplot(melt(data[, c(1, 3, 4)], id.vars = "port")) +
  geom_col(aes(x = port, y = value, fill = variable)) +
  #theme_bw()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

# Making noisy data (training_noisy: training dataset with 20% noise)
```{r}
# Build a noisy copy of the training set. One fifth of the number of training
# rows is passed to generate_data_noisy() as the noise amount.
set.seed(101)
noisy_data <- training.bkp <- training

porcent <- nrow(training) / 5
training_noisy <- generate_data_noisy(noisy_data, porcent)

# Row counts before and after, for a quick visual comparison.
nrow(training)
nrow(training_noisy)
```

```{r}
# Fit one random forest on the full training set and score it on the testing
# set, thresholding the Botnet probability at 0.5.
# NOTE(review): `settings` is not assigned in this chunk; confirm it exists in
# the session (other chunks pass `ctrl_fast` as trControl).
rfFit <- train(class ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl,
                 data = training,
                 metric="ROC",
                 method = "rf",
                 trControl = settings)
predsrfprobs <- predict(rfFit, testing, type = 'prob')
predsrf <- ifelse(predsrfprobs$Botnet >= 0.5, 'Botnet', 'Normal')
cm <- confusionMatrix(predsrf, testing$class)
# Select metrics by name instead of position (formerly byClass[11] and
# overall[1]), so the selection does not depend on the element order.
result <- cm$byClass['Balanced Accuracy']
cm$overall['Accuracy']
```

# Robustness Analysis (one simple iteration)
```{r}
# One pass of the robustness study: measure random-forest performance for
# noise levels 2, 4, ..., 90 (percent of the training row count) and derive
# the ELA measure of each level against the clean-data baseline.
# `noisy_data` is not assigned here and must already exist in the session.
set.seed(321)
partitions <- seq(from = 2, to = 90, by = 2)
rf_measures_result <- NULL
ela_measures_result <- NULL
total <- nrow(training)

# Baseline: performance on the untouched training set.
balanced_accuracy <- randomForest_performace(training, testing)

# Results are stored at position i, so only the even slots get filled.
for (i in partitions) {
  porcent <- i * total / 100
  training_noisy <- generate_data_noisy(noisy_data, porcent)
  balanced_accuracy_aux <- randomForest_performace(training_noisy, testing)
  rf_measures_result[i] <- balanced_accuracy_aux
}

# ELA measure of every noise level relative to the baseline.
for (i in partitions) {
  balanced_accuracy_noisy <- rf_measures_result[i]
  ela_measures_result[i] <- get_ELA_measure(
    A0 = balanced_accuracy['Balanced Accuracy'],
    Ax = balanced_accuracy_noisy
  )
}

plot(ela_measures_result)
plot(rf_measures_result)
```

# Robustness Analysis: plotting accuracy
```{r}
# Scatter plot with a smoother of the random-forest measure against the noise
# percentage, rendered interactively through plotly.
index <- seq(from = 2, to = 90, by = 2)
# Only the even positions of rf_measures_result hold values.
measure_result <- rf_measures_result[index]
rla_measure_data <- data.frame(
  noise_porcent = index,
  balanced_accuracy = measure_result
)

g <- ggplot(rla_measure_data, aes(x = noise_porcent, y = balanced_accuracy)) +
  geom_point() +
  geom_smooth()
ggplotly(g)
```

# Robustness Analysis: plotting the ELA measure
```{r}
# Scatter plot with a smoother of the ELA measure against the noise percentage.
index <- seq(from = 2, to = 90, by = 2)
# Only the even positions of ela_measures_result hold values.
measure_result <- ela_measures_result[index]
ela_measure_data <- data.frame(
  noise_porcent = index,
  ela_measure = measure_result
)

ggplot(ela_measure_data, aes(x = noise_porcent, y = ela_measure)) +
  geom_point() +
  geom_smooth()
```

# Robustness Analysis (30 iterations)
```{r}
# Robustness study repeated 30 times. Each repetition measures random-forest
# performance and the ELA measure for noise levels 2, 4, ..., 90 and appends
# them as a new column `iteration_<j>` to the two result tables. Both tables
# are rewritten to disk after every repetition, so partial results survive an
# interrupted run.
# `balanced_accuracy` (the clean baseline) and `noisy_data` are not assigned
# in this chunk and must already exist in the session.
set.seed(331)
partitions <- seq(2, 90, 2)

total <- nrow(training)
result_rf_accuracy <- data.frame('count_of_data' = partitions)
result_ela_measure <- data.frame('count_of_data' = partitions)
write.table(result_rf_accuracy,file="test_result_rf_accuracy.txt",sep="|", row.names = F)
write.table(result_ela_measure,file="test_result_ela_measure.txt",sep="|", row.names = F)
for (j in seq_len(30)) {
  rf_measures_result <- c()
  ela_measures_result <- c()
  # Compute RF performance for each noise percentage applied to training.
  for (i in partitions) {
    porcent <- (i * total) / 100
    training_noisy <- generate_data_noisy(noisy_data, porcent)
    balanced_accuracy_aux <- randomForest_performace(training_noisy, testing)
    rf_measures_result[i] <- balanced_accuracy_aux
  }
  # Compute the ELA measure of each noise level against the baseline.
  for (i in partitions) {
    balanced_accuracy_noisy <- rf_measures_result[i]
    ela_measures_result[i] <- get_ELA_measure(A0 = balanced_accuracy['Balanced Accuracy'], Ax = balanced_accuracy_noisy)
  }

  # Keep only the even positions, which are the ones that hold values.
  ela_measures_result <- ela_measures_result[partitions]
  rf_measures_result <- rf_measures_result[partitions]

  # Name the new column after the repetition. `cbind(df, colum_name = v)` would
  # label every column literally "colum_name", so assign by computed name.
  colum_name <- paste0('iteration_', j)
  result_rf_accuracy[[colum_name]] <- rf_measures_result
  result_ela_measure[[colum_name]] <- ela_measures_result

  write.table(result_rf_accuracy,file="test_result_rf_accuracy.txt",sep="|", row.names = F)
  write.table(result_ela_measure,file="test_result_ela_measure.txt",sep="|", row.names = F)
}

result_ela_measure
result_rf_accuracy

```

```{r, fig.height=8}
# Relabel the 30-repetition result tables (noise level + iteration_1..30) and
# draw one box per noise level with the mean of the repetitions overlaid.
x <- c('nosisy_porcent', paste0('iteration_', seq_len(30)))
x
# Keep untouched copies before the columns are renamed.
result_ela_measure.bkp <- result_ela_measure
result_rf_accuracy.bkp <- result_rf_accuracy
names(result_ela_measure) <- x
names(result_rf_accuracy) <- x

result_rf_accuracy
# Columns 2:31 (the 30 repetitions) are stacked into iteration/value pairs.
result_rf_accuracy %>%
  gather("iteration", "value", 2:31) %>%
  ggplot() +
  geom_boxplot(aes(x = as.factor(nosisy_porcent), y = value)) +
  stat_summary(
    fun.y = "mean",
    aes(x = as.factor(nosisy_porcent), y = value, color = value)
  )
  #scale_color_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0.5)
```

# Cosine Similarity
```{r}
# Similarity-based prediction: each testing row is flattened to a plain vector
# and handed to prediction_by_similarity() together with the whole training
# set and the value 100.
training
testing

# Single-row trial on the first testing row.
prediction_vector <- as.vector(as.matrix(testing[1, ]))
result <- prediction_by_similarity(training, prediction_vector, 100)
result

# Same call for every testing row, then a confusion matrix against the class.
result <- NULL
for (i in seq_len(nrow(testing))) {
  prediction_vector <- as.vector(as.matrix(testing[i, ]))
  result[i] <- prediction_by_similarity(training, prediction_vector, 100)
}
vector_result <- unlist(result)
cm <- confusionMatrix(vector_result, testing$class)
cm
```

####################### 
```{r}
# Learning curve for the similarity-based predictor: F1 on the testing set as
# the training subset grows in steps of 200 rows. Only one shuffle (seed 226)
# is evaluated; the outer loop over seeds is disabled.
cs_data.result <- data.frame(rf_result_5$output$data_count)
current_seed <- 226
set.seed(current_seed)

size_training <- nrow(training)
cs_training.sampled_current <- training[sample(size_training, size_training), ]
split_size_training <- size_training / 200
metric <- numeric(split_size_training)
for (j in 1:split_size_training) {
  # First `count` rows of the shuffled training set.
  count <- 200 * j
  aux_training_set <- cs_training.sampled_current[c(1:count), ]

  # Predict every testing row from the current training subset.
  result <- NULL
  for (k in seq_len(nrow(testing))) {
    prediction_vector <- as.vector(as.matrix(testing[k, ]))
    output_result <- prediction_by_similarity(aux_training_set, prediction_vector, 101)
    result[k] <- output_result
  }
  vector_result <- unlist(result)
  cm <- confusionMatrix(vector_result, testing$class)
  metric[j] <- cm$byClass['F1']
}
# Append the curve as a new column next to the training-size column.
cs_data.result <- cbind(cs_data.result, metric)
cs_data.result
```

```{r}

```